#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: Prepare Census data for covariates employed in subsequent analyses.

#Load packages
library(tidyverse)
library(janitor)
library(tidycensus)
library(here)

#Look up the catalogue of variables for the 2000 Census Summary File 3
variables <- load_variables(2000, "sf3")
#Sex by age: keep the rows of this table and extract their variable codes
sex_by_age_out <- variables %>%
  filter(concept == "SEX BY AGE [79]")
sex_by_age <- sex_by_age_out[["name"]]
#Race
race_out <- variables %>%
  filter(concept == "RACE [8]")
race <- race_out[["name"]]
#Sex by educational attainment
sex_by_edu_out <- variables %>%
  filter(concept == "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER [35]")
sex_by_edu <- sex_by_edu_out[["name"]]
#Codes belonging to the three multi-column tables, in the order they are requested
table_vars <- c(sex_by_age, race, sex_by_edu)
#Codes requested individually
single_vars <- c(
  "P087002", #Poverty: total income in 1999 below poverty level
  "P082001", #Household income per capita in 1999
  "P007010", #Total Hispanic
  "P047003", #Total male worked in 1999
  "P047027", #Total female worked in 1999
  "P021013", #Total foreign born
  "P005002", #Total urban
  "P005006", #Total rural (farm)
  "P005007", #Total rural (non-farm)
  "P005005"  #Total rural
)
#Full vector of variables to download: population first, then the tables,
#then the individual measures. The vector is left unnamed on purpose.
get_vars <- c(
  "P001001", #Population
  table_vars,
  single_vars
)
#Request every variable in get_vars at the county level from the 2000
#decennial Census (Summary File 3); the result comes back in long format
census00 <- get_decennial(
  geography = "county",
  variables = get_vars,
  year = 2000,
  sumfile = "sf3"
)
#Reshape to one row per county (GEOID/NAME) and one column per variable code
census00_wide <- census00 %>%
  pivot_wider(
    id_cols = c(GEOID, NAME),
    names_from = variable,
    values_from = value
  )
#Readable names for the identifier columns and the individually requested
#variables, written as new_name = "old_name"
readable_names <- c(
  pop = "P001001",
  poverty = "P087002",
  income99pc = "P082001",
  hispanic = "P007010",
  work_male = "P047003",
  work_female = "P047027",
  foreign = "P021013",
  urban = "P005002",
  rural_farm = "P005006",
  rural_nonfarm = "P005007",
  rural = "P005005",
  fips = "GEOID",
  county = "NAME"
)
#Apply the renames; column order is left untouched
census00_wide <- rename(census00_wide, all_of(readable_names))
#Add descriptive names using the variable tables populated earlier.
#Columns are located by their Census variable code instead of by hard-coded
#column positions, so the labels stay correct even if the number or order of
#downloaded columns changes, and a missing variable raises an error instead
#of silently shifting labels onto the wrong columns.
#  data:       wide data frame whose columns are still named by variable code
#  table_vars: rows of the variable catalogue for one table (uses the `name`
#              and `label` columns)
#Returns `data` with the table's columns relabelled and the column labelled
#"Total" removed.
relabel_table <- function(data, table_vars) {
  positions <- match(table_vars$name, names(data))
  if (anyNA(positions)) {
    stop(
      "Census variables missing from the downloaded data: ",
      paste(table_vars$name[is.na(positions)], collapse = ", "),
      call. = FALSE
    )
  }
  names(data)[positions] <- table_vars$label
  #Drop the table's overall "Total" column right away so that the next
  #table's own "Total" label does not collide with it
  data$Total <- NULL
  data
}
##sex_by_age
census00_wide <- relabel_table(census00_wide, sex_by_age_out)
##race
census00_wide <- relabel_table(census00_wide, race_out)
##sex_by_edu
census00_wide <- relabel_table(census00_wide, sex_by_edu_out)
##Aggregate the detailed sex-by-age counts into larger age bins, pooling
##males and females, giving one row per county with one column per bin.
#NOTE(review): at this point `Total!!Male` also names an education column
#(the education labels were assigned above), so the column ranges below rely
#on the name resolving to the sex-by-age column, which comes first - confirm
#if the relabelling step is ever reordered.
age00 <- census00_wide %>%
  pivot_longer(cols = c(`Total!!Male`:`Total!!Female!!85 years and over`)) %>%
  #Strip the "Total", sex, and "!!" separator parts so only the age label is left
  mutate(name = gsub("Total!!|Male|Female|!!", "", name)) %>%
  dplyr::select(fips, name, value) %>%
  #The per-sex subtotal columns reduce to an empty label; discard them
  filter(name != "") %>%
  mutate(
    name = case_when(
      #Note: despite its name, this bin runs through "18 years" inclusive
      name %in% c("Under 1 year", "1 year", paste0(2:18, " years")) ~ "age_under18",
      name %in% c("19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years") ~ "age_19_29",
      name %in% c("30 to 34 years", "35 to 39 years") ~ "age_30_39",
      name %in% c("40 to 44 years", "45 to 49 years") ~ "age_40_49",
      name %in% c("50 to 54 years", "55 to 59 years") ~ "age_50_59",
      name %in% c("60 and 61 years", "62 to 64 years", "65 and 66 years", "67 to 69 years") ~ "age_60_69",
      name %in% c("70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over") ~ "age_over70"
    )
  ) %>%
  group_by(fips, name) %>%
  #Drop the grouping once the bins are summed so that age00 is a plain
  #(ungrouped) table and summarise() does not emit its regrouping message
  summarise(value = sum(value), .groups = "drop") %>%
  pivot_wider(names_from = name, values_from = value)
#Drop the detailed sex-by-age columns now that they have been aggregated
census00_wide <- subset(census00_wide, select = -c(`Total!!Male`:`Total!!Female!!85 years and over`))
#Short names for the race categories, written as new_name = "census label"
race_names <- c(
  white = "Total!!White alone",
  black = "Total!!Black or African American alone",
  asian = "Total!!Asian alone",
  pacific = "Total!!Native Hawaiian and Other Pacific Islander alone",
  indig = "Total!!American Indian and Alaska Native alone",
  raceother = "Total!!Some other race alone",
  racemore = "Total!!Two or more races"
)
#Apply the renames; column order is left untouched
census00_wide <- rename(census00_wide, all_of(race_names))
#Get education population values: collapse the detailed sex-by-education
#counts into five attainment bins per sex, giving one row per county and one
#column per bin/sex combination (e.g. edu_ba_f, edu_ba_m)
sexedu00 <- census00_wide %>%
  pivot_longer(cols = c(`Total!!Male`:`Total!!Female!!Doctorate degree`)) %>%
  dplyr::select(fips, name, value) %>%
  mutate(name = gsub("Total!!", "", name)) %>%
  #Discard the per-sex subtotal rows; only sex-by-attainment cells are kept
  filter(name != "Male" & name != "Female") %>%
  #Labels now look like "<sex>!!<attainment>"
  separate(name, into = c("sex", "edu"), sep = "!!") %>%
  mutate(
    edu = case_when(
      #Note: edu_hs covers everything up to and including a high-school diploma
      edu %in% c("No schooling completed", "Nursery to 4th grade", "5th and 6th grade",
                 "7th and 8th grade", "9th grade", "10th grade", "11th grade", "12th grade, no diploma",
                 "High school graduate (includes equivalency)") ~ "edu_hs",
      edu %in% c("Some college, less than 1 year", "Some college, 1 or more years, no degree") ~ "edu_somecollege",
      edu %in% c("Associate degree") ~ "edu_ass",
      edu %in% c("Bachelor's degree") ~ "edu_ba",
      edu %in% c("Master's degree", "Professional school degree", "Doctorate degree") ~ "edu_grad"
    )
  ) %>%
  group_by(fips, sex, edu) %>%
  #Drop the grouping once the bins are summed so the later steps work on a
  #plain (ungrouped) table and summarise() does not emit its regrouping message
  summarise(value = sum(value), .groups = "drop") %>%
  #Abbreviate sex for use as a column-name suffix
  mutate(sex = ifelse(sex == "Female", "f", "m")) %>%
  pivot_wider(names_from = c(edu, sex), values_from = value)
#Drop the detailed sex-by-education columns now that they have been aggregated
census00_wide <- subset(census00_wide, select = -c(`Total!!Male`:`Total!!Female!!Doctorate degree`))
#Attach the aggregated age bins to the county-level table
census00_wide <- census00_wide %>%
  left_join(age00, by = "fips")
#Attach the education bins as well to form the output table
census00_out <- census00_wide %>%
  left_join(sexedu00, by = "fips")

#Adjust FIPS codes so they match ones used in later years.
#Convert the FIPS code to numeric first so that every comparison, assignment,
#and row bind below works on a single type instead of relying on implicit
#character/numeric coercion.
census00_out$fips <- as.numeric(census00_out$fips)
#Oglala Lakota County, SD: Shannon County, SD (FIPS code = 46113) was renamed
#Oglala Lakota County and assigned a new FIPS code (46102) effective in 2014.
#Oglala Lakota County has a category code in all three of the NCHS schemes.
#The row is duplicated under the new code; the original row is kept.
oglala <- census00_out[census00_out$fips == 46113, ]
oglala$fips <- 46102
census00_out <- bind_rows(census00_out, oglala)
##Duplicate Miami-Dade (12086) under code 12025 so that this matches up with
##the change in FIPS code in the presidential election data
miami <- census00_out[census00_out$fips == 12086, ]
miami$fips <- 12025
census00_out <- bind_rows(census00_out, miami)
#The county name is not needed in the saved data
census00_out$county <- NULL
#Save data
saveRDS(census00_out, here("data", "inter", "census00.rds"))
